---
title: "Data preparation"
output: html_document
---

# Setup

```{r save prepared data for analysis}
# Switch for every save() call in this document: each one is wrapped in
# if(export_prepared_data), so FALSE runs the preparation without writing files
export_prepared_data <- TRUE
```


```{r load packages}
# Load packages
# Packages that are attached because their functions are called without prefix
p_needed <- c("foreign", "data.table", "stringr", "quanteda", "stargazer", "reshape2", "nnet", "MASS", "knitr", "sjPlot", "tictoc", "haven")
# Packages that are only called with the pkg:: prefix (tidyr::pivot_wider below);
# they must be installed but do not need to be attached
p_namespace_only <- c("tidyr")

p_to_install <- setdiff(c(p_needed, p_namespace_only), rownames(installed.packages()))
if (length(p_to_install) > 0) {
  install.packages(p_to_install)
}

# require() returns FALSE instead of raising an error, so collect the results
# and stop explicitly; otherwise the first failure shows up as an obscure
# "could not find function" many chunks later
p_loaded <- vapply(p_needed, require, logical(1), character.only = TRUE)
print(p_loaded)
p_namespace_ok <- vapply(p_namespace_only, requireNamespace, logical(1), quietly = TRUE)
p_failed <- c(p_needed[!p_loaded], p_namespace_only[!p_namespace_ok])
if (length(p_failed) > 0) {
  stop("Could not load required package(s): ", paste(p_failed, collapse = ", "), call. = FALSE)
}

rm("p_needed", "p_namespace_only", "p_to_install", "p_loaded", "p_namespace_ok", "p_failed")
```

```{r load my functions}
# Source every file found in the "Functions" folder so that the helper
# functions defined there are available in the chunks below
function_files <- paste0("Functions/", list.files("Functions"))
for (function_file in function_files) {
  source(function_file)
}
rm("function_files", "function_file")
```

# Load data

```{r load data}
# Load candidate tweets collected with the identifiers provided by Stier et al. (2018)
loaded_stier <- load("Data/candidate_tweets_of_stier_et_al.RData") # object is called candidates_tweets

# Load tweets provided online to download by Kratzke (2017)
loaded_kratzke <- load("Data/Pol_tweets_kratzke.RData") # object is called df_pol

# load() restores whatever object names were stored in the file and returns
# them; check that the two objects used below were really among them so that a
# wrong or outdated file fails here and not somewhere in the middle of the merge
missing_objects <- setdiff(c("candidates_tweets", "df_pol"), c(loaded_stier, loaded_kratzke))
if (length(missing_objects) > 0) {
  stop("Expected object(s) not found in the loaded .RData files: ",
       paste(missing_objects, collapse = ", "), call. = FALSE)
}
rm("loaded_stier", "loaded_kratzke", "missing_objects")
```

# Prepare dictionaries

```{r Prepare dictionaries}
# Read one dictionary file and return it as a matrix with one column per
# policy area and one row per keyword slot.
#   path     : file to read; the first field of every line becomes the column
#              name (row.names = 1 followed by the transpose)
#   n_fields : number of comma-separated fields assumed per line; shorter lines
#              are padded because fill = TRUE
#   encoding : handed to the reader; only "latin1" and "UTF-8" are recognised
#   reader   : read.table or read.csv (their quote/comment defaults differ, so
#              the reader originally used for each file is kept)
read_dictionary <- function(path, n_fields, encoding, reader = read.table) {
  t(reader(path, header = FALSE, fill = TRUE, sep = ",", row.names = 1,
           col.names = seq_len(n_fields), encoding = encoding))
}

# Load party specific dictionaries
afd <- read_dictionary("Data/Dictionaries/dictionary_afd.txt", 40, "latin1")
cdu <- read_dictionary("Data/Dictionaries/dictionary_cducsu.txt", 40, "latin1")
fdp <- read_dictionary("Data/Dictionaries/dictionary_fdp.txt", 40, "latin1")
spd <- read_dictionary("Data/Dictionaries/dictionary_spd.txt", 40, "latin1")
left <- read_dictionary("Data/Dictionaries/dictionary_left.txt", 40, "latin1")
green <- read_dictionary("Data/Dictionaries/dictionary_green.txt", 40, "latin1")

# Dictionary based on all parties
general_dic <- read_dictionary("Data/Dictionaries/general_dictionary.txt", 60, "UTF-8", reader = read.csv)
# Small additions made to the dictionary to balance agriculture and party competition
# ("UTF" is not a recognised encoding value and was silently ignored; "UTF-8" is
# the value used for the other UTF file above)
addition_to_dic <- read_dictionary("Data/Dictionaries/original_addition.txt", 35, "UTF-8")
# Words moved to a different category where they are more plausible
moved_by_hand_dic <- read_dictionary("Data/Dictionaries/moved_to_general.txt", 7, "latin1")

# Stack a party specific dictionary on top of the three general dictionaries
# and clean the entries. Whitespace is trimmed BEFORE blanks are turned into NA
# so that a whitespace-only field cannot survive as an empty-string pattern.
combine_with_general <- function(party_dic) {
  combined <- rbind(party_dic, general_dic, addition_to_dic, moved_by_hand_dic)
  combined[] <- trimws(combined)
  combined[!is.na(combined) & combined == ""] <- NA
  combined
}

# Combine the three general dictionaries with the party specific dictionaries to individual party dictionaries
afd <- combine_with_general(afd)
cdu <- combine_with_general(cdu)
fdp <- combine_with_general(fdp)
spd <- combine_with_general(spd)
left <- combine_with_general(left)
green <- combine_with_general(green)

all_dic <- list("afd"=afd,"cducsu"=cdu,"fdp"=fdp,"spd"=spd,"left"=left,"greens"=green)


dict_cat <- colnames(afd) # List of policy areas
# List of policy areas without party competition; selected by name so the
# result does not depend on the line order of the dictionary files
dict_cat_wo_pc <- dict_cat[dict_cat != "party_competition"]

rm("general_dic", "addition_to_dic", "moved_by_hand_dic", "read_dictionary", "combine_with_general")

if (export_prepared_data) {
  save(dict_cat, file="Data_processed_for_analysis/dictionary_categories.RData")
  save(dict_cat_wo_pc, file="Data_processed_for_analysis/dictionary_categories_without_party_competition.RData")
}
```


# Merge Tweets collected from Stier et al. with data set from Kratzke

```{r}
# Add accounts from Kratzke that were not included in Stier et al.
# The Stier et al. handles are lower case, so the Kratzke handles are
# lower-cased for every comparison.
screen_names_stier <- unique(candidates_tweets$screen_name_tolower)
screen_names_kratzke <- unique(df_pol$screen_name)
table(tolower(screen_names_kratzke) %in% screen_names_stier)
kratzke_but_not_stier <- screen_names_kratzke[!tolower(screen_names_kratzke) %in% screen_names_stier]

# The resulting list of candidates stored in "kratzke_but_not_stier" was inspected manually and every account was checked whether they ran for office again. This resulted in the list of candidates to add (cand_to_add) to the general data set.

cand_to_add <- c("georg_pazderski", "hoffmannmdb", "zieglermdb", "fostendorff", "jenszimmermann1", "manfred_grund", "kudlaleipzig", "kerstin_tack", "marcusheld_spd", "ostermannmdb", "FlorianPost", "marcelklingevs", "spinrathnorbert", "martinrabanus")

# Never add an account that is already part of the Stier et al. data once case
# is ignored; otherwise its tweets could enter the data set twice
cand_to_add <- cand_to_add[!tolower(cand_to_add) %in% screen_names_stier]

kratzke_additional_tweets <- df_pol[tolower(df_pol$screen_name) %in% tolower(cand_to_add),]
rm(df_pol)

# Restrict the tweets to the time span of the main data source
# The two replacements strip the "+0000" offset from the timestamp string
tweet_created_raw <- str_replace_all(as.character(kratzke_additional_tweets$created_at_tweet), "0000 ", "")
tweet_created_raw <- str_replace_all(tweet_created_raw, "\\+", "")
# %a and %b must be parsed as English names; note that the locale stays
# switched for the rest of the session
Sys.setlocale("LC_TIME", "C")
# The stripped offset was +0000, so the timestamps are parsed as UTC rather than
# in the local time zone of the machine; POSIXct (not POSIXlt) is the class
# that belongs into a data frame column
kratzke_additional_tweets$created_at <- as.POSIXct(strptime(tweet_created_raw, format="%a %b %e %H:%M:%S %Y", tz = "UTC"))
# Rows whose timestamp could not be parsed are dropped explicitly; an NA in the
# row index would otherwise create rows that are NA in every column
in_time_span <- !is.na(kratzke_additional_tweets$created_at) &
  kratzke_additional_tweets$created_at >= min(candidates_tweets$created_at)
kratzke_additional_tweets <- kratzke_additional_tweets[in_time_span,]


kratzke_additional_tweets$status_id <- kratzke_additional_tweets$id_str_tweet
kratzke_additional_tweets$user_id <- as.character(kratzke_additional_tweets$id_str)
# Lower-case the handle: it is the key for the merge with the candidate meta
# data, which uses lower-cased handles ("FlorianPost" could never match before)
kratzke_additional_tweets$screen_name_tolower <- tolower(kratzke_additional_tweets$screen_name)
# "lang.1" is part of the same exclusion list: removing it with -which(...)
# would drop ALL columns if the column were ever absent
cols_to_drop <- c("created_at_tweet", "id_tweet", "id_str_tweet", "favorited", "retweeted", "filter_level", "timestamp_ms", "id", "id_str", "screen_name", "strptime_tweet_created", "party", "lang.1")
kratzke_additional_tweets <- kratzke_additional_tweets[,!colnames(kratzke_additional_tweets) %in% cols_to_drop]

# Columns that only exist in the main data set are added as NA so that both
# data frames can be stacked
kratzke_additional_tweets[setdiff(names(candidates_tweets), names(kratzke_additional_tweets))] <- NA
candidates_tweets <- rbind(candidates_tweets,kratzke_additional_tweets)

rm("screen_names_stier", "screen_names_kratzke", "kratzke_but_not_stier", "cand_to_add", "kratzke_additional_tweets", "cols_to_drop", "tweet_created_raw", "in_time_span")

```

# Add meta data to candidates tweets

```{r Add meta data to candidates tweets and orgs tweets}
# Add candidate meta data (available online from Stier et al. 2018) to the tweet data
candidates_meta <- fread("Data/candidates_utf8_stier_et_al.csv", encoding="UTF-8", data.table = TRUE)

# The Twitter handle is the part of the profile link that follows "twitter.com/"
link_parts <- str_split(candidates_meta$twlink, "twitter.com/")
candidates_meta$twitter_name <- vapply(link_parts, function(parts) parts[2], character(1))
candidates_meta$twitter_name_tolower <- tolower(candidates_meta$twitter_name)
candidates_meta$full_name <- paste(candidates_meta$firstname, candidates_meta$name)

# NOTE(review): presumably `age` holds the year of birth in the raw file - confirm
candidates_meta$age <- 2017-candidates_meta$age

# Party codes 1..7 in the order of this vector; CDU (1) and CSU (5) share a label
party_labels <- c("cducsu", "spd", "left", "greens", "cducsu", "fdp", "afd")
candidates_meta$partyname <- party_labels[match(candidates_meta$party, seq_along(party_labels))]

# Keep only tweets of accounts that can be linked to a candidate with a known party
candidates_tweets <- merge(candidates_tweets, candidates_meta, all.x = TRUE, by.x="screen_name_tolower", by.y="twitter_name_tolower")
has_party <- !is.na(candidates_tweets$partyname)
candidates_tweets <- candidates_tweets[has_party,]


# Remove unnecessary columns for faster computation time
unneeded_cols <- c("symbols", "coords_coords",
                   "urls_url", "urls_t.co", "urls_expanded_url",
                   "media_url", "media_t.co","media_expanded_url",
                   "media_type",  "ext_media_url", "ext_media_t.co",
                   "ext_media_expanded_url", "ext_media_type",
                   "quoted_status_id","quoted_text","quoted_created_at",
                   "quoted_source","quoted_favorite_count", "quoted_retweet_count",
                   "quoted_user_id","quoted_screen_name","quoted_name",
                   "quoted_followers_count","quoted_friends_count","quoted_statuses_count",
                   "quoted_location","quoted_description","quoted_verified",
                   "place_url","place_name","place_full_name","place_type",
                   "geo_coords", "coords_coords", "bbox_coords", "protected")
candidates_tweets <- candidates_tweets[!(names(candidates_tweets) %in% unneeded_cols)]

rm("link_parts", "party_labels", "has_party", "unneeded_cols")
```

# Classify tweets based on dictionary

```{r Preperation for the classification}
# Words that are removed from the tweets in addition to the German stopwords
words_to_remove <- c("dass", "wir", "ab", "für", "wollen", "müssen", "auf", "innen",
                     "sollen", "dafür","mehr", "unser*", "dabei", "seit", "deshalb",
                     "viel", "setzen", "besser", "immer", "viel*",
                     "daher", "insbesond*", "wichtig", "sowie")

# Remove the hashtag symbol from the text (the hashtag word itself stays)
candidates_tweets$text <- str_replace_all(candidates_tweets$text, "#", "")


# One long data frame with the dictionary words of all parties; it is used
# later on to store the frequency of every word
long_dictionaries <- Map(dictionary_into_long_format, all_dic, names(all_dic))
all_dic_long <- Reduce(rbind, unname(long_dictionaries), data.frame(matrix(ncol=3, nrow=0)))

# Add one indicator column per dictionary category, initialised with 0
prep <- as.data.frame(matrix(0, nrow = 1, ncol = ncol(all_dic[[1]]),
                             dimnames = list(NULL, dict_cat)))
candidates_tweets <- cbind(candidates_tweets, prep)

rm("prep", "long_dictionaries")
```


```{r Create dfm for each party}
# Build one document-feature matrix (dfm) per party. The list has six slots,
# one for each value of unique(candidates_tweets$partyname), in that order.
twit_dfm_each_party <- vector("list", 6)

# This loop can take a moment (3.5 minutes on my machine)
tic()
for (i in 1:length(twit_dfm_each_party)) {
  # Corpus of the tweets of the i-th party; "text" becomes the document text,
  # all other columns are carried along as document variables
  twit_corp <- corpus(candidates_tweets[candidates_tweets$partyname==unique(candidates_tweets$partyname)[i],], text_field = "text")
  # Pipeline: tokenise -> remove German stopwords and words_to_remove ->
  # build uni- and bigrams -> dfm -> keep features that occur at least twice.
  # NOTE(review): stem/remove_url/remove_punct are passed to dfm() here; newer
  # quanteda releases expect these steps elsewhere - confirm the installed version
  twit_dfm_each_party[[i]] <- tokens(twit_corp) %>% 
    tokens_remove(pattern=c(stopwords("de"), words_to_remove),
                  padding = TRUE,
                  min_nchar = 2L) %>% 
    tokens_ngrams(n = 1:2) %>%
    dfm(stem = TRUE,
        remove_url = T,
        remove_punct = TRUE) %>%
    dfm_trim(min_termfreq = 2)
}
toc()

# Number of features in the dfm of the first party (printed as a quick check)
nfeat(twit_dfm_each_party[[1]])
```


```{r Classify the tweets based on the dictionary}
# Classify every tweet with the dictionary of its party and count how often
# every dictionary word occurs. Results are collected per party in lists and
# bound once after the loop instead of growing data frames inside the loop.
res_df <- data.frame(matrix(ncol=length(docvars(twit_dfm_each_party[[1]])), nrow=0))
colnames(res_df) <- names(docvars(twit_dfm_each_party[[1]]))

dic_word_count <- data.frame(matrix(ncol=4, nrow=0))
colnames(dic_word_count) <- c("word", "category", "party", "count")

docvars_by_party <- vector("list", length(twit_dfm_each_party))
word_count_pieces <- list()
party_order <- character(length(twit_dfm_each_party))

for (k in seq_along(twit_dfm_each_party)) {
  party <- unique(twit_dfm_each_party[[k]]$partyname)
  party_order[k] <- party
  party_dic <- all_dic[[party]]

  for (i in seq_len(ncol(party_dic))) {
    # Dictionary words of this party for the i-th policy area
    one_party_one_topic <- party_dic[, i]
    one_party_one_topic <- unique(one_party_one_topic[!is.na(one_party_one_topic)])

    # Keep only the dictionary features; a tweet is classified into the policy
    # area if at least one of them occurs in it
    one_party_one_topic_dfm <- dfm_select(twit_dfm_each_party[[k]], pattern=one_party_one_topic, selection = "keep")
    classified_row <- rownames(one_party_one_topic_dfm)[rowSums(one_party_one_topic_dfm)!=0]
    select_cat_col <- names(docvars(twit_dfm_each_party[[k]]))==dict_cat[i]
    docvars(twit_dfm_each_party[[k]])[rownames(twit_dfm_each_party[[k]]) %in% classified_row,select_cat_col] <- 1

    # Frequency of the dictionary words
    # NOTE(review): only the 80 most frequent matched features get a count, all
    # other dictionary words end up with 0 - confirm that this cap is intended
    one_p_one_t_dic <- all_dic_long[all_dic_long$party==party&all_dic_long$category==dict_cat[i],]
    one_p_one_t_topf <- topfeatures(one_party_one_topic_dfm, 80)
    # as.character() keeps the "word" column when nothing matched: names() is
    # NULL then, the column would vanish and merge() would fail
    one_p_one_t_topf <- data.frame("word"=as.character(names(one_p_one_t_topf)), "count"=as.numeric(one_p_one_t_topf))
    one_p_one_t_freq <- merge(one_p_one_t_dic, one_p_one_t_topf, by.x="word", by.y="word", all.x = TRUE)
    word_count_pieces[[length(word_count_pieces) + 1]] <- one_p_one_t_freq
  }
  docvars_by_party[[k]] <- docvars(twit_dfm_each_party[[k]])
}

res_df <- do.call(rbind, c(list(res_df), docvars_by_party))
dic_word_count <- do.call(rbind, c(list(dic_word_count), word_count_pieces))

# The tweet text is not part of the docvars and has to be added again. res_df
# is stacked party by party, whereas candidates_tweets is still in its original
# row order, so the text is collected in the same party-by-party order;
# assigning candidates_tweets$text directly would attach texts to the wrong tweets.
text_by_party <- lapply(party_order, function(party) {
  candidates_tweets$text[candidates_tweets$partyname == party]
})
stopifnot(all(lengths(text_by_party) == vapply(docvars_by_party, nrow, integer(1))))
res_df$text <- unlist(text_by_party, use.names = FALSE)
candidates_tweets <- res_df

dic_word_count$count <- ifelse(is.na(dic_word_count$count),0,dic_word_count$count)

rm("one_party_one_topic", "one_party_one_topic_dfm", "classified_row", "select_cat_col",
   "one_p_one_t_dic", "one_p_one_t_topf", "one_p_one_t_freq",
   "docvars_by_party", "word_count_pieces", "party_order", "party", "party_dic", "text_by_party")

if (export_prepared_data) {
  save(dic_word_count, file="Data_processed_for_analysis/data_word_count_dictionary_words.RData")
  save(candidates_tweets, file="Data_processed_for_analysis/data_tweets_classified.RData")
}
```

```{r group values for individual politicans and calculate herfindahl index}
# Aggregate the classified tweets per politician: number of tweets, number and
# share of classified tweets (with and without party competition), Herfindahl
# index over the policy areas and the number of tweets per policy area.
vec_of_politicians <- unique(candidates_tweets$screen_name_tolower)

politician_rows <- lapply(vec_of_politicians, function(politician) {
  # 0/1 category indicators of all tweets of a single politician
  single_politician <- candidates_tweets[candidates_tweets$screen_name_tolower == politician, dict_cat, drop = FALSE]
  single_politician_wo_pc <- single_politician[, names(single_politician) != "party_competition", drop = FALSE]

  # A tweet counts as classified if it was assigned to at least one category.
  # The same rule is applied for every number of tweets; the former special
  # cases for one tweet and for "nothing classified" gave the same numbers,
  # except that one branch assigned the share without party competition to
  # itself and that a single tweet hitting party competition AND another
  # category was counted as unclassified without party competition.
  number_of_tweets <- nrow(single_politician)
  number_classified <- sum(rowSums(single_politician) != 0)
  number_classified_wo_pc <- sum(rowSums(single_politician_wo_pc) != 0)

  summary_part <- data.frame(
    screen_name_tolower = politician,
    # Herfindahl index over the category counts without party competition
    herfindahl_index = as.numeric(calc_herfindahl_index(colSums(single_politician_wo_pc))),
    politician_number_of_tweets = as.numeric(number_of_tweets),
    politician_number_of_classified_tweets = as.numeric(number_classified),
    politician_share_of_classified_tweets = number_classified / number_of_tweets,
    politician_number_of_classified_tweets_wo_pc = as.numeric(number_classified_wo_pc),
    politician_share_of_classified_tweets_wo_pc = number_classified_wo_pc / number_of_tweets,
    stringsAsFactors = FALSE
  )
  # Number of tweets per category as one row with the category names as columns
  cbind(summary_part, as.data.frame(t(colSums(single_politician))))
})
politicians_additional_data <- do.call(rbind, politician_rows)

# Only candidates for whom tweets exist are kept (inner join)
candidates <- merge(candidates_meta, politicians_additional_data, by.x = "twitter_name_tolower", by.y="screen_name_tolower")

rm("politicians_additional_data", "politician_rows")
```

# Manifesto data

```{r load and prepare data from the manifesto project}
# Data can be downloaded from the website of the Manifesto project
# (metadata is filtered to the German entries coded for 2017 and removed again
# at the end of this chunk without being used in between)
metadata <- read.csv("Data/Manifesto data set/MPDataset_MPDS2019b.csv", encoding = "UTF-8")
metadata <- metadata[metadata$countryname=="Germany"&metadata$coderyear==2017,]
metadata$partyname <- factor(metadata$partyname)

# One row per manifesto file; the order defines the party order in mani_df
manifesto_sources <- data.frame(
  file = c("Data/Manifesto data set/41113_201709.csv",
           "Data/Manifesto data set/41223_201709.csv",
           "Data/Manifesto data set/41320_201709.csv",
           "Data/Manifesto data set/41420_201709.csv",
           "Data/Manifesto data set/41521_201709.csv",
           "Data/Manifesto data set/41953_201709.csv"),
  partyname = c("Alliance'90/Greens",
                "The Left",
                "Social Democratic Party of Germany",
                "Free Democratic Party",
                "Christian Democratic Union/Christian Social Union",
                "Alternative for Germany"),
  partynameshort = c("greens", "left", "spd", "fdp", "cducsu", "afd"),
  stringsAsFactors = FALSE
)

# Read every manifesto and tag its rows with the long and short party name
manifestos <- lapply(seq_len(nrow(manifesto_sources)), function(source_row) {
  manifesto <- read.csv(manifesto_sources$file[source_row], encoding = "UTF-8")
  manifesto$partyname <- manifesto_sources$partyname[source_row]
  manifesto$partynameshort <- manifesto_sources$partynameshort[source_row]
  manifesto
})

mani_df <- do.call(rbind, manifestos)
mani_df <- mani_df[,c("content", "cmp_code", "partyname", "partynameshort")]
mani_df$domain <- substr(mani_df$cmp_code,0,1)
mani_df$content <- as.character(mani_df$content)
mani_df$coding <- as.numeric(as.character(mani_df$cmp_code))


# Adding our categories: manifesto codes that belong to each policy area.
# The position in this list is the numeric category id stored in dict_cat,
# the name is the label stored in dict_cat_lab.
cmp_codes_by_category <- list(
  foreign_affairs          = c(101, 103.2, 107, 109),
  european_affairs         = c(108, 110),
  defence                  = c(104, 105, 106),
  justice                  = c(201.1, 201.2, 202.1, 202.2, 202.4),
  federalism               = c(203, 204, 301, 302, 303),
  party_competition        = c(305.1, 305.2),
  economics                = c(401, 402, 406, 407, 410),
  finance                  = c(403, 409, 412, 413, 414),
  technology_infrastucture = c(411),
  education_culture        = c(502, 506, 507),
  labour_social_affairs    = c(405, 504, 505, 701),
  environment              = c(501, 416.2),
  home_affairs             = c(601.1, 602.1),
  migration                = c(601.2, 602.2),
  interior                 = c(304, 603, 604, 605.1, 605.2),
  equality                 = c(503, 606.1, 706),
  multiculturalism         = c(607.1, 607.2, 607.3, 608.1, 608.2, 608.3),
  agriculture              = c(703.1)
)

mani_df$dict_cat <- NA
for (category_id in seq_along(cmp_codes_by_category)) {
  mani_df$dict_cat[mani_df$coding %in% cmp_codes_by_category[[category_id]]] <- as.numeric(category_id)
}

# Add labels
mani_df$dict_cat_lab <- NA
for (category_id in seq_along(cmp_codes_by_category)) {
  mani_df$dict_cat_lab[which(mani_df$dict_cat == category_id)] <- names(cmp_codes_by_category)[category_id]
}


rm("manifesto_sources", "manifestos", "cmp_codes_by_category", "category_id", "metadata")
```

```{r Run the dictionary over the manifesto data}
# Add one indicator column per dictionary category (initialised with 0) to the
# manifesto data; the classification further down switches them to 1
prep <- data.frame(matrix(0, ncol=length(dict_cat)))
colnames(prep) <- dict_cat
mani_df <- cbind(mani_df, prep)

# One document-feature matrix per party, in the order of
# unique(mani_df$partynameshort)
mani_dfm_each_party <- vector("list", 6)

t1 <- Sys.time()
for (i in 1:length(mani_dfm_each_party)) {
  # Corpus of the manifesto rows of the i-th party; "content" is the document text
  mani_corp <- corpus(mani_df[mani_df$partynameshort==unique(mani_df$partynameshort)[i],], text_field = "content")
  # Same pipeline as for the tweets: tokenise -> remove German stopwords and
  # words_to_remove -> uni- and bigrams -> dfm -> keep features occurring at least twice.
  # NOTE(review): stem/remove_url/remove_punct are passed to dfm() here; newer
  # quanteda releases expect these steps elsewhere - confirm the installed version
  mani_dfm_each_party[[i]] <- tokens(mani_corp) %>%
    tokens_remove(pattern=c(stopwords("de"), words_to_remove),
                  padding = TRUE,
                  min_nchar = 2L) %>%
    tokens_ngrams(n = 1:2) %>%
    dfm(stem = TRUE,
        remove_url = T,
        remove_punct = TRUE) %>%
    dfm_trim(min_termfreq = 2)
}
t2 <- Sys.time()
t2-t1 # took 3 seconds

# Number of features in the dfm of the first party (printed as a quick check)
nfeat(mani_dfm_each_party[[1]])


# Empty containers with the column layout of the results
first_docvars <- docvars(mani_dfm_each_party[[1]])
res_df_mani <- setNames(data.frame(matrix(ncol=length(first_docvars), nrow=0)), names(first_docvars))
dic_word_count_mani <- setNames(data.frame(matrix(ncol=4, nrow=0)), c("word", "category", "party", "count"))

names(all_dic) <- c("afd", "cducsu", "fdp", "spd", "left", "greens")
all_dic_long$party <- as.factor(all_dic_long$party)
levels(all_dic_long$party) <- c("afd", "cducsu", "fdp", "greens", "left", "spd")

# Classify the manifesto rows party by party with the dictionary of that party
mani_docvars_pieces <- vector("list", length(mani_dfm_each_party))
mani_word_count_pieces <- list()

for (party_idx in seq_along(mani_dfm_each_party)) {
  party_short <- unique(mani_dfm_each_party[[party_idx]]$partynameshort)
  party_dic <- all_dic[names(all_dic)==party_short][[1]]

  for (cat_idx in seq_len(ncol(party_dic))) {
    # Dictionary words of this party for the current policy area
    topic_words <- party_dic[,cat_idx]
    topic_words <- unique(topic_words[complete.cases(topic_words)])

    # A row is classified if at least one dictionary feature occurs in it
    topic_dfm <- dfm_select(mani_dfm_each_party[[party_idx]], pattern=topic_words, selection = "keep")
    hit_docs <- rownames(topic_dfm)[rowSums(topic_dfm)!=0]
    is_cat_col <- names(docvars(mani_dfm_each_party[[party_idx]]))==dict_cat[cat_idx]
    docvars(mani_dfm_each_party[[party_idx]])[rownames(mani_dfm_each_party[[party_idx]]) %in% hit_docs,is_cat_col] <- 1

    # Frequency of the (at most 80 most frequent) matched dictionary features;
    # categories without any match are skipped
    topic_dic <- all_dic_long[all_dic_long$party==party_short&all_dic_long$category==dict_cat[cat_idx],]
    top_counts <- topfeatures(topic_dfm, 80)
    top_counts <- data.frame("word"=names(top_counts), "count"=as.numeric(top_counts))
    if (nrow(top_counts) > 0) {
      mani_word_count_pieces[[length(mani_word_count_pieces) + 1]] <-
        merge(topic_dic, top_counts, by.x="word", by.y="word", all.x = TRUE)
    }
  }
  mani_docvars_pieces[[party_idx]] <- docvars(mani_dfm_each_party[[party_idx]])
}

res_df_mani <- Reduce(rbind, mani_docvars_pieces, res_df_mani)
dic_word_count_mani <- Reduce(rbind, mani_word_count_pieces, dic_word_count_mani)

# The text is not part of the docvars and is added again
res_df_mani$content <- mani_df$content
mani_df <- res_df_mani

rm("res_df_mani", "first_docvars", "mani_docvars_pieces", "mani_word_count_pieces")

dic_word_count_mani$count <- ifelse(is.na(dic_word_count_mani$count),0,dic_word_count_mani$count)

if (export_prepared_data) {
  save(mani_df, file="Data_processed_for_analysis/data_manifesto.RData")
}
#if(export_prepared_data) {save(dic_word_count_mani, file="Data_processed_for_analysis/data_word_count_manifesto_words.RData")}
```

# Add reelection probability data

```{r}
# data with reelection probabilities is available on request
data_list <- read_dta("Data/reelec_probs_allcan171819_c_and_b_CIs.dta")

# Full name ("first name last name") as the key for the merge
data_list$author <- trimws(paste(data_list$Vorname, data_list$Name))

# Keep the entries from 2012 onwards and, per name, only the last row
data_list <- data_list[data_list$Jahr>=2012,]
data_list <- data_list[!duplicated(data_list$author, fromLast=TRUE),]

# create full names
candidates$name_complete <- paste(candidates$firstname, candidates$name)
# How many candidates are found in the reelection data?
table(candidates$name_complete %in% data_list$author) # Looking extremely good!

candidates <- merge(candidates, data_list, by.x = "name_complete", by.y = "author", all.x = TRUE)
```

# Add specialization

```{r}
# load data from committee affiliation (German Ausschuss)
ausschuss <- read.csv("Data/data_committee_members_bt_2017_to 2021.csv", sep =";", encoding="UTF-8")
colnames(ausschuss)[1] <- "Ausschuss"
ausschuss$name_complete <- paste(ausschuss$Abgeordneter_vorname, ausschuss$Abgeordneter_nachname)

# Committee name -> policy area of the dictionary; "" marks committees that are
# not assigned to any policy area. Name and policy area are kept side by side so
# that the two cannot get out of step.
# NOTE(review): "Gesundheit" is mapped to finance - confirm that this is intended
committee_to_category <- c(
  "die Angelegenheiten der Europäischen Union" = "european_affairs",
  "1. Untersuchungsausschuss" = "",
  "1. Untersuchungsausschuss des Verteidigungsausschusses" = "defence",
  "2. Untersuchungsausschuss" = "",
  "3. Untersuchungsausschuss" = "",
  "Arbeit und Soziales" = "labour_social_affairs",
  "Auswärtiger Ausschuss" = "foreign_affairs",
  "Bau, Wohnen, Stadtentwicklung und Kommunen" = "technology_infrastucture",
  "Bildung, Forschung und Technikfolgenabschätzung" = "education_culture",
  "Bundesfinanzierungsgremium" = "finance",
  "Digitale Agenda" = "technology_infrastucture",
  "Enquete-Kommission „Berufliche Bildung in der digitalen Arbeitswelt“" = "",
  "Enquete-Kommission „Künstliche Intelligenz – Gesellschaftliche Verantwortung und wirtschaftliche, soziale und ökologische Potenziale“" = "",
  "Ernährung und Landwirtschaft" = "agriculture",
  "Familie, Senioren, Frauen und Jugend" = "equality",
  "Finanzausschuss" = "finance",
  "Gemeinsamer Ausschuss" = "federalism",
  "Gesundheit" = "finance",
  "Gremium nach § 80 des Zollfahndungsdienstgesetzes" = "",
  "Gremium nach Artikel 13 Absatz 6 des Grundgesetzes" = "",
  "Haushaltsausschuss" = "finance",
  "Inneres und Heimat" = "interior",
  "Kommission zur Reform des Bundeswahlrechts und zur Modernisierung der Parlamentsarbeit" = "",
  "Kultur und Medien" = "education_culture",
  "Menschenrechte und humanitäre Hilfe" = "migration",
  "Parlamentarischer Beirat für nachhaltige Entwicklung" = "environment",
  "Parlamentarisches Kontrollgremium" = "",
  "Petitionsausschuss" = "",
  "Recht und Verbraucherschutz" = "justice",
  "Sportausschuss" = "",
  "Tourismus" = "",
  "Umwelt, Naturschutz und nukleare Sicherheit" = "environment",
  "Verkehr und digitale Infrastruktur" = "technology_infrastucture",
  "Vermittlungsausschuss" = "federalism",
  "Verteidigungsausschuss" = "defence",
  "Vertrauensgremium" = "",
  "Wahlausschuss für die Richter des Bundesverfassungsgerichts" = "",
  "Wahlprüfung, Immunität und Geschäftsordnung" = "",
  "Wahlprüfungsausschuss" = "",
  "Wirtschaft und Energie" = "economics",
  "wirtschaftliche Zusammenarbeit und Entwicklung" = "foreign_affairs"
)

# Recode the committee names into policy areas (several committees share one)
ausschuss$Ausschuss <- factor(ausschuss$Ausschuss,
                              levels = names(committee_to_category),
                              labels = unname(committee_to_category))
table(ausschuss$Ausschuss)

ausschuss$Ausschuss <- as.character(ausschuss$Ausschuss)
ausschuss$Ausschuss[ausschuss$Ausschuss==""] <- "irrelevant"
ausschuss$Ausschuss <- paste0("ausschuss_", ausschuss$Ausschuss)

# One row per member with one 0/1 column per policy area
ausschuss <- ausschuss[,c("Ausschuss", "Partei", "name_complete")]
ausschuss$dummy <- 1
ausschuss <- ausschuss[!duplicated(ausschuss),]
ausschuss <- tidyr::pivot_wider(ausschuss, id_cols = c("name_complete","Partei"), names_from = "Ausschuss", values_from = "dummy", values_fill = list(dummy = 0))
ausschuss <- data.frame(ausschuss)
ausschuss <- ausschuss[!duplicated(ausschuss$name_complete),]
candidates <- merge(candidates, ausschuss, by="name_complete", all.x = TRUE)
candidates <- candidates[,-"ausschuss_irrelevant"]

# Columns of the committees that belong to a policy area. They are selected by
# name: the column order produced by pivot_wider depends on the row order of
# the committee file, so a positional range from one column to another could
# silently miss committees.
committee_cols <- intersect(paste0("ausschuss_", setdiff(unique(committee_to_category), "")),
                            colnames(candidates))
# Number of relevant committees per candidate (candidates without committee
# data have NA in these columns and count as 0)
n_committees <- rowSums(as.data.frame(candidates)[committee_cols], na.rm=TRUE)

candidates$any_ausschuss <- n_committees>0

table(n_committees>0)
# 237 politicans with a resort we are using
table(n_committees==1)
table(n_committees==2)
table(n_committees==3)
table(n_committees>3)

rm("committee_to_category", "committee_cols", "n_committees")
```

```{r}
# Labelled version of the 0/1 coded sex variable (0 = female, 1 = male)
candidates$sex_factor <- factor(candidates$sex, levels=c(0,1), labels=c("female", "male"))
```

```{r}
# Write the candidate level data set to disk (only if exporting is switched on)
if (export_prepared_data) {
  save(candidates, file="Data_processed_for_analysis/data_candidates.RData")
}
```
